IMPORT

1. Librairies

library(phyloseq) # for phyloseq object
library(ggplot2)
library(cowplot)
library(tidyverse)
library("plotly") # plot 3D
library("microbiome") # for centered log-ratio
library("coda") # Aitchison distance
library("coda.base") # Aitchison distance
library("vegan") # NMDS
library(pheatmap) # for heatmap

2. Data

# Set path
path <- "~/Projects/IBS_Meta-analysis_16S"

# Import phyloseq object
physeq.hugerth <- readRDS(file.path(path, "phyloseq-objects/physeq_hugerth.rds"))

# Sanity check
physeq.hugerth
## phyloseq-class experiment-level object
## otu_table()   OTU Table:         [ 5872 taxa and 525 samples ]
## sample_data() Sample Data:       [ 525 samples by 19 sample variables ]
## tax_table()   Taxonomy Table:    [ 5872 taxa by 7 taxonomic ranks ]
## phy_tree()    Phylogenetic Tree: [ 5872 tips and 5870 internal nodes ]
## refseq()      DNAStringSet:      [ 5872 reference sequences ]

Phylogenetic tree was computed with the package phangorn, and the script was run on a cluster. Let’s check we have correctly generated a phylogenetic tree.

# Look at the tree
plot_tree(physeq.hugerth, color = "host_disease", ladderize="left")

DEMOGRAPHICS

This dataset has several covariates (gender, age, bmi, sample_storage_duration). We will check whether there is the same distribution of these covariates between healthy and IBS patients.

# Number of individuals in each group (keeping just 1 sample per individual)
metadata <- data.frame(sample_data(physeq.hugerth)) %>%
  select(host_disease, host_bmi, host_age, host_sex, host_psy, host_ID) %>%
  group_by(host_ID) %>%
  summarise_all(first)
  
metadata %>%
  count(host_disease)
# Age
metadata %>%
  group_by(host_disease) %>%
  summarize(mean_age=mean(host_age), sd_age=sd(host_age))
wilcox.test(metadata[metadata$host_disease == "IBS", ]$host_age,
            metadata[metadata$host_disease == "Healthy", ]$host_age) # p=0.23
## 
##  Wilcoxon rank sum test with continuity correction
## 
## data:  metadata[metadata$host_disease == "IBS", ]$host_age and metadata[metadata$host_disease == "Healthy", ]$host_age
## W = 11154, p-value = 0.2311
## alternative hypothesis: true location shift is not equal to 0
# Gender
metadata %>%
  count(host_disease, host_sex)
chisq.test(data.frame("Female" = c(164,53),
                      "Male" = c(123,32))) # p=0.47
## 
##  Pearson's Chi-squared test with Yates' continuity correction
## 
## data:  data.frame(Female = c(164, 53), Male = c(123, 32))
## X-squared = 0.53372, df = 1, p-value = 0.465
# BMI
metadata %>%
  group_by(host_disease) %>%
  summarize(mean_bmi=mean(na.omit(host_bmi)), sd_bmi=sd(na.omit(host_bmi)))
wilcox.test(metadata[metadata$host_disease == "IBS",]$host_bmi,
            metadata[metadata$host_disease == "Healthy", ]$host_bmi) # 0.07
## 
##  Wilcoxon rank sum test with continuity correction
## 
## data:  metadata[metadata$host_disease == "IBS", ]$host_bmi and metadata[metadata$host_disease == "Healthy", ]$host_bmi
## W = 10460, p-value = 0.06854
## alternative hypothesis: true location shift is not equal to 0
# Sample storage duration (take into account all samples per individual)
data.frame(sample_data(physeq.hugerth)) %>%
  group_by(host_disease) %>%
  summarize(mean_storage_duration = mean(sample_storage_duration),
            sd = sd(sample_storage_duration))
wilcox.test(sample_data(physeq.hugerth)[sample_data(physeq.hugerth)$host_disease == "IBS", ]$sample_storage_duration,
            sample_data(physeq.hugerth)[sample_data(physeq.hugerth)$host_disease == "Healthy", ]$sample_storage_duration) # p=0.07
## 
##  Wilcoxon rank sum test with continuity correction
## 
## data:  sample_data(physeq.hugerth)[sample_data(physeq.hugerth)$host_disease == "IBS", ]$sample_storage_duration and sample_data(physeq.hugerth)[sample_data(physeq.hugerth)$host_disease == "Healthy", ]$sample_storage_duration
## W = 22042, p-value = 0.07467
## alternative hypothesis: true location shift is not equal to 0

ABUNDANCES

1. Absolute abundances

# Plot Phylum
plot_bar(physeq.hugerth, fill = "Phylum") + facet_wrap("host_disease", scales="free") +
  theme(axis.text.x = element_text(size = 5))+
  labs(x = "Samples", y = "Absolute abundance", title = "Hugerth dataset (2020)") +
  ylim(0,200000)

# Plot Class
plot_bar(physeq.hugerth, fill = "Class")+ facet_wrap("host_disease", scales="free") +
  theme(axis.text.x = element_text(size = 5))+
  labs(x = "Samples", y = "Absolute abundance", title = "Hugerth dataset (2020)")+
  ylim(0,200000)

Sequencing depth characteristics of the Hugerth dataset:
- minimum of 573 total count per sample
- median: 2.03410^{4} total count per sample
- maximum of 1.891810^{5} total count per sample

2. Relative abundances

# Agglomerate to phylum & class levels
phylum.table <- physeq.hugerth %>%
  tax_glom(taxrank = "Phylum") %>%                     # agglomerate at phylum level
  transform_sample_counts(function(x) {x/sum(x)} ) %>% # Transform to rel. abundance
  psmelt()                                             # Melt to long format

class.table <- physeq.hugerth %>%
  tax_glom(taxrank = "Class") %>%
  transform_sample_counts(function(x) {x/sum(x)} ) %>%
  psmelt()


# Plot relative abundances
ggplot(phylum.table, aes(x = reorder(Sample, Sample, function(x) mean(phylum.table[Sample == x & Phylum == 'Bacteroidota', 'Abundance'])),
                         y = Abundance, fill = Phylum))+
  facet_wrap(~ sample_type + host_disease, scales = "free") + # scales = "free" removes empty lines
  geom_bar(stat = "identity") +
  theme(axis.text.x = element_blank())+
  labs(x = "Samples", y = "Relative abundance", title = "Hugerth dataset (2020)")

ggplot(class.table, aes(x = reorder(Sample, Sample, function(x) mean(class.table[Sample == x & Phylum == 'Bacteroidota', 'Abundance'])),
                        y = Abundance, fill = Class))+
  facet_wrap(~ sample_type + host_disease, scales = "free") +
  geom_bar(stat = "identity") +
  theme(axis.text.x = element_blank(),
        legend.key.size = unit(0.2, 'cm'),
        legend.text = element_text(size=8))+
  labs(x = "Samples", y = "Relative abundance", title = "Hugerth dataset (2020)")

3. Firmicutes/Bacteroidota ratio

# Extract abundance of only Bacteroidota and Firmicutes
relevant.covariates <- c('Sample', 'Abundance', 'host_disease', 'Phylum', 'sample_type', 'host_ID', 'host_age', 'host_sex', 'host_bmi')

bacter <- phylum.table %>%
  filter(Phylum == "Bacteroidota") %>%
  select(all_of(relevant.covariates)) %>%
  dplyr::rename(Bacteroidota = Abundance) %>%
  select(-Phylum)

firmi <- phylum.table %>%
  filter(Phylum == "Firmicutes") %>%
  select(all_of(relevant.covariates)) %>%
  dplyr::rename(Firmicutes = Abundance) %>%
  select(-Phylum)

# Calculate log2 ratio Firmicutes/Bacteroidota
ratio.FB <- left_join(x=bacter, y=firmi, by=c('Sample', 'host_disease', 'sample_type', 'host_ID', 'host_age', 'host_sex', 'host_bmi')) %>%
  relocate(Firmicutes, .after=Bacteroidota) %>%
  # Compute log ratios
  mutate(logRatioFB = log2(Firmicutes/Bacteroidota))


# Plot log2 ratio Firmicutes/Bacteroidota
ggplot(ratio.FB, aes(x = host_disease, y = logRatioFB))+
  geom_violin(aes(fill=host_disease))+
  scale_fill_manual(values=scales::alpha(c("blue", "red"), .3))+
  geom_jitter(width=0.1)+
  facet_wrap(~sample_type) +
  labs(x = "",  y = 'Log2(Firmicutes/Bacteroidota)', title = "Firmicutes:Bacteroidota ratio")+
  theme_cowplot()+
  theme(legend.position="none")

# Statistical test sigmoid samples
wilcox.test(ratio.FB[ratio.FB$sample_type == "sigmoid" & ratio.FB$host_disease == "IBS","logRatioFB"],
            ratio.FB[ratio.FB$sample_type == "sigmoid" & ratio.FB$host_disease == "Healthy","logRatioFB"]) # p = 0.6
## 
##  Wilcoxon rank sum test with continuity correction
## 
## data:  ratio.FB[ratio.FB$sample_type == "sigmoid" & ratio.FB$host_disease == "IBS", "logRatioFB"] and ratio.FB[ratio.FB$sample_type == "sigmoid" & ratio.FB$host_disease == "Healthy", "logRatioFB"]
## W = 10098, p-value = 0.5673
## alternative hypothesis: true location shift is not equal to 0
# Statistical test stool samples
wilcox.test(ratio.FB[ratio.FB$sample_type == "stool" & ratio.FB$host_disease == "IBS","logRatioFB"],
            ratio.FB[ratio.FB$sample_type == "stool" & ratio.FB$host_disease == "Healthy","logRatioFB"]) # p=0.12
## 
##  Wilcoxon rank sum test with continuity correction
## 
## data:  ratio.FB[ratio.FB$sample_type == "stool" & ratio.FB$host_disease == "IBS", "logRatioFB"] and ratio.FB[ratio.FB$sample_type == "stool" & ratio.FB$host_disease == "Healthy", "logRatioFB"]
## W = 3306, p-value = 0.1229
## alternative hypothesis: true location shift is not equal to 0
# Paired data
ggplot(ratio.FB, aes(x = sample_type, y = logRatioFB))+
  geom_violin(aes(fill=host_disease))+
  scale_fill_manual(values=scales::alpha(c("blue", "red"), .3))+
  geom_point()+
  geom_line(aes(group=host_ID), lwd=0.1) +
  facet_wrap(~host_disease) +
  labs(x = "",  y = 'Log2(Firmicutes/Bacteroidota)', title = "Paired data") +
  theme_cowplot()+
  theme(legend.position="none")

# Statistical test sigmoid vs stool samples
wilcox.test(ratio.FB[ratio.FB$sample_type == "stool" & ratio.FB$host_disease == "Healthy","logRatioFB"],
            ratio.FB[ratio.FB$sample_type == "sigmoid" & ratio.FB$host_disease == "Healthy","logRatioFB"]) # p=0.1
## 
##  Wilcoxon rank sum test with continuity correction
## 
## data:  ratio.FB[ratio.FB$sample_type == "stool" & ratio.FB$host_disease == "Healthy", "logRatioFB"] and ratio.FB[ratio.FB$sample_type == "sigmoid" & ratio.FB$host_disease == "Healthy", "logRatioFB"]
## W = 15957, p-value = 0.09111
## alternative hypothesis: true location shift is not equal to 0
wilcox.test(ratio.FB[ratio.FB$sample_type == "stool" & ratio.FB$host_disease == "IBS","logRatioFB"],
            ratio.FB[ratio.FB$sample_type == "sigmoid" & ratio.FB$host_disease == "IBS","logRatioFB"]) # p=0.3
## 
##  Wilcoxon rank sum test with continuity correction
## 
## data:  ratio.FB[ratio.FB$sample_type == "stool" & ratio.FB$host_disease == "IBS", "logRatioFB"] and ratio.FB[ratio.FB$sample_type == "sigmoid" & ratio.FB$host_disease == "IBS", "logRatioFB"]
## W = 1887.5, p-value = 0.2984
## alternative hypothesis: true location shift is not equal to 0

NORMALIZE DATA

# Sanity check no sample with less than 500 total count
table(sample_sums(physeq.hugerth)<500) # all FALSE

#____________________________________________________________________
# PHYLOSEQ OBJECT WITH NON-ZERO COMPOSITIONS
physeq.NZcomp <- physeq.hugerth
otu_table(physeq.NZcomp)[otu_table(physeq.NZcomp) == 0] <- 0.5 # pseudocounts

# Sanity check that 0 values have been replaced
# otu_table(physeq.hugerth)[1:5,1:5]
# otu_table(physeq.NZcomp)[1:5,1:5]

# transform into compositions
physeq.NZcomp <- transform_sample_counts(physeq.NZcomp, function(x) x / sum(x) )
table(rowSums(otu_table(physeq.NZcomp))) # check if there is any row not summing to 1

# Save object
saveRDS(physeq.NZcomp, file.path(path, "data/analysis-individual/Hugerth-2019/02_EDA-Hugerth/physeq_NZcomp.rds"))

#____________________________________________________________________
# PHYLOSEQ OBJECT WITH RELATIVE COUNT (BETWEEN 0 AND 1)
physeq.rel <- physeq.hugerth
physeq.rel <- transform_sample_counts(physeq.rel, function(x) x / sum(x) )

# check the counts are all relative
# otu_table(physeq.hugerth)[1:5, 1:5]
# otu_table(physeq.rel)[1:5, 1:5]

# sanity check
table(rowSums(otu_table(physeq.rel))) # check if there is any row not summing to 1

# save the physeq.rel object
saveRDS(physeq.rel, file.path(path, "data/analysis-individual/Hugerth-2019/02_EDA-Hugerth/physeq_relative.rds"))

#____________________________________________________________________
# PHYLOSEQ OBJECT WITH COMMON-SCALE NORMALIZATION
physeq.CSN <- physeq.hugerth
physeq.CSN <- transform_sample_counts(physeq.CSN, function(x) (x*min(sample_sums(physeq.CSN))) / sum(x) )

# sanity check
table(rowSums(otu_table(physeq.CSN))) # check that all rows are summing to the same total

# save the physeq.CSN object
saveRDS(physeq.CSN, file.path(path, "data/analysis-individual/Hugerth-2019/02_EDA-Hugerth/physeq_CSN.rds"))


#____________________________________________________________________
# PHYLOSEQ OBJECT WITH CENTERED LOG RATIO COUNT
physeq.clr <- physeq.hugerth
physeq.clr <- microbiome::transform(physeq.hugerth, "clr") # the function adds pseudocounts itself

# Compare the otu tables in the original phyloseq object and the new one after CLR transformation
# otu_table(physeq.hugerth)[1:5, 1:5] # should contain absolute counts
# otu_table(physeq.clr)[1:5, 1:5] # should all be relative

# save the physeq.rel object
saveRDS(physeq.clr, file.path(path, "data/analysis-individual/Hugerth-2019/02_EDA-Hugerth/physeq_clr.rds"))

COMPUTE DISTANCES

1. UniFrac, Aitchison, Bray-Curtis and Canberra

First, let’s look at these four distances of interest.

#____________________________________________________________________________________
# Measure distances
getDistances <- function(relPhyseq=physeq.rel, clrPhyseq=physeq.clr, csnPhyseq=physeq.CSN, nzcompPhyseq=physeq.NZcomp){
  # sanity check
  cat("nb samples relPhyseq:", nsamples(relPhyseq), "\n")
  cat("nb samples clrPhyseq:", nsamples(clrPhyseq), "\n")
  cat("nb samples csnPhyseq:", nsamples(csnPhyseq), "\n")
  cat("nb samples nzcompPhyseq:", nsamples(nzcompPhyseq), "\n")
  
  # Compute distances
  print("Unifrac...")
  set.seed(123) # for unifrac, need to set a seed
  glom.UniF <- UniFrac(relPhyseq, weighted=TRUE, normalized=TRUE) # weighted unifrac
  print("Aitchison...")
  glom.ait <- phyloseq::distance(clrPhyseq, method = 'euclidean') # aitchison
  print("Bray des bois...")
  glom.bray <- phyloseq::distance(csnPhyseq, method = "bray") # bray-curtis
  print("Canberra <3...")
  glom.can <- phyloseq::distance(nzcompPhyseq, method = "canberra") # canberra
  
  dist.list <- list("UniF" = glom.UniF, "Ait" = glom.ait, "Canb" = glom.can, "Bray" = glom.bray)
  
  return(dist.list)
}


#____________________________________________________________________________________
# Plot in 2D the distances
plotDistances2D <- function(dlist, ordination="MDS", relPhyseq=physeq.rel, clrPhyseq=physeq.clr, csnPhyseq=physeq.CSN, nzcompPhyseq=physeq.NZcomp){
  plist <- NULL
  plist <- vector("list", 4)
  names(plist) <- c("Weighted Unifrac", "Aitchison", "Bray-Curtis", "Canberra")
  
  print("Unifrac")
  # Weighted UniFrac
  set.seed(123)
  iMDS.UniF <- ordinate(relPhyseq, ordination, distance=dlist$UniF)
  plist[[1]] <- plot_ordination(relPhyseq, iMDS.UniF, color="host_disease")
  
  print("Aitchison")
  # Aitchison
  set.seed(123)
  iMDS.Ait <- ordinate(clrPhyseq, ordination, distance=dlist$Ait)
  plist[[2]] <- plot_ordination(clrPhyseq, iMDS.Ait, color="host_disease")
  
  print("Bray")
  # Bray-Curtis
  set.seed(123)
  iMDS.Bray <- ordinate(csnPhyseq, ordination, distance=dlist$Bray)
  plist[[3]] <- plot_ordination(csnPhyseq, iMDS.Bray, color="host_disease")
  
  print("Canberra")
  # Canberra
  set.seed(123)
  iMDS.Can <- ordinate(nzcompPhyseq, ordination, distance=dlist$Can)
  plist[[4]] <- plot_ordination(nzcompPhyseq, iMDS.Can, color="host_disease")
  
  # Creating a dataframe to plot everything
  plot.df = plyr::ldply(plist, function(x) x$data)
  names(plot.df)[1] <- "distance"
  
  return(plot.df)
}

Now let’s plot!

a) Fecal samples

#________________
# FECAL DATA
#________________

# Get the distances & the plot data
dist.hugerth.fecal <- getDistances(relPhyseq = subset_samples(physeq.rel, sample_type=="stool"),
                                   clrPhyseq = subset_samples(physeq.clr, sample_type=="stool"),
                                   csnPhyseq = subset_samples(physeq.CSN, sample_type=="stool"),
                                   nzcompPhyseq = subset_samples(physeq.NZcomp, sample_type=="stool"))
## nb samples relPhyseq: 174 
## nb samples clrPhyseq: 174 
## nb samples csnPhyseq: 174 
## nb samples nzcompPhyseq: 174 
## [1] "Unifrac..."
## [1] "Aitchison..."
## [1] "Bray des bois..."
## [1] "Canberra <3..."
plot.df.fecal <- plotDistances2D(dlist=dist.hugerth.fecal,
                                 relPhyseq = subset_samples(physeq.rel, sample_type=="stool"),
                                 clrPhyseq = subset_samples(physeq.clr, sample_type=="stool"),
                                 csnPhyseq = subset_samples(physeq.CSN, sample_type=="stool"),
                                 nzcompPhyseq = subset_samples(physeq.NZcomp, sample_type=="stool"))
## [1] "Unifrac"
## [1] "Aitchison"
## [1] "Bray"
## [1] "Canberra"
# Plot
ggplot(plot.df.fecal, aes(Axis.1, Axis.2, color=host_disease))+
  geom_point(size=4, alpha=0.5)  + scale_color_manual(values = c('blue', 'red'))+
  facet_wrap(distance~., scales='free', nrow=1)+
  theme_bw()+
  theme(strip.text.x = element_text(size=20))+
  labs(color="Disease", title="Fecal samples")

# ggsave(file.path(path.plots, "distances4_MDS_stool.jpg"), height = 4, width = 15)

b) Sigmoid samples

#________________
# SIGMOID DATA
#________________

# Get the distances & the plot data
dist.hugerth.sigmoid <- getDistances(relPhyseq = subset_samples(physeq.rel, sample_type=="sigmoid"),
                                   clrPhyseq = subset_samples(physeq.clr, sample_type=="sigmoid"),
                                   csnPhyseq = subset_samples(physeq.CSN, sample_type=="sigmoid"),
                                   nzcompPhyseq = subset_samples(physeq.NZcomp, sample_type=="sigmoid"))
## nb samples relPhyseq: 351 
## nb samples clrPhyseq: 351 
## nb samples csnPhyseq: 351 
## nb samples nzcompPhyseq: 351 
## [1] "Unifrac..."
## [1] "Aitchison..."
## [1] "Bray des bois..."
## [1] "Canberra <3..."
plot.df.sigmoid <- plotDistances2D(dlist=dist.hugerth.sigmoid,
                                 relPhyseq = subset_samples(physeq.rel, sample_type=="sigmoid"),
                                 clrPhyseq = subset_samples(physeq.clr, sample_type=="sigmoid"),
                                 csnPhyseq = subset_samples(physeq.CSN, sample_type=="sigmoid"),
                                 nzcompPhyseq = subset_samples(physeq.NZcomp, sample_type=="sigmoid"))
## [1] "Unifrac"
## [1] "Aitchison"
## [1] "Bray"
## [1] "Canberra"
# Plot
ggplot(plot.df.sigmoid, aes(Axis.1, Axis.2, color=host_disease))+
  geom_point(size=4, alpha=0.5)  + scale_color_manual(values = c('blue', 'red'))+
  facet_wrap(distance~., scales='free', nrow=1)+
  theme_bw()+
  theme(strip.text.x = element_text(size=20))+
  labs(color="Disease", title="Sigmoid samples")

# ggsave(file.path(path.plots, "distances4_MDS_sigmoid.jpg"), height = 4, width = 15)

c) All samples

#________________
# ALL DATA
#________________

# Get the distances & the plot data
dist.hugerth <- getDistances()
## nb samples relPhyseq: 525 
## nb samples clrPhyseq: 525 
## nb samples csnPhyseq: 525 
## nb samples nzcompPhyseq: 525 
## [1] "Unifrac..."
## [1] "Aitchison..."
## [1] "Bray des bois..."
## [1] "Canberra <3..."
plot.df <- plotDistances2D(dlist=dist.hugerth)
## [1] "Unifrac"
## [1] "Aitchison"
## [1] "Bray"
## [1] "Canberra"
# Plot
p1 <- ggplot(plot.df, aes(Axis.1, Axis.2, color=host_disease))+
  geom_point(size=4, alpha=0.5)  + scale_color_manual(values = c('blue', 'red'))+
  facet_wrap(distance~., scales='free', nrow=1)+
  theme_bw()+
  theme(strip.text.x = element_text(size=20),
        axis.title.x = element_blank())+
  labs(color="Disease")
p2 <- ggplot(plot.df, aes(Axis.1, Axis.2, color=sample_type))+
  geom_line(aes(group=host_ID), color="black", lwd=0.1)+
  geom_point(size=4, alpha=0.5)  + scale_color_manual(values = c('#33CC00', 'purple'))+
  facet_wrap(distance~., scales='free', nrow=1)+
  theme_bw()+
  theme(strip.text.x = element_blank())+
  labs(color="Sample type")
ggpubr::ggarrange(p1,p2, nrow=2)

# ggsave(file.path(path.plots, "distances4_MDS_all.jpg"), height = 8, width = 15)

2. Plot in 3D

For better visualization, we will also take a glance at reduction to 3D.

#____________________________________________________________________________________
# Plot 3D ordination
plotDistances3D <- function(d, name_dist, physeq=physeq.hugerth){
  
  # Reset parameters
  mds.3D <- NULL
  xyz <- NULL
  fig.3D <- NULL
  
  # Reduce distance matrix to 3 dimensions
  set.seed(123)
  mds.3D <- metaMDS(d, method="MDS", k=3, trace = 0)
  xyz <- scores(mds.3D, display="sites") # pull out the (x,y,z) coordinates
  
  # Plot
  fig.3D <- plot_ly(x=xyz[,1], y=xyz[,2], z=xyz[,3], type="scatter3d", mode="markers",
                    color=sample_data(physeq)$host_disease, colors = c("blue", "red"))%>%
    layout(title = paste('MDS in 3D with', name_dist, 'distance', sep = ' '))
  
  return(fig.3D)
}

Now let’s plot!

# Fecal samples
plotDistances3D(dist.hugerth.fecal$UniF, "UniFrac", subset_samples(physeq.hugerth, sample_type=="stool"))
plotDistances3D(dist.hugerth.fecal$Ait, "Aitchison", subset_samples(physeq.hugerth, sample_type=="stool"))
plotDistances3D(dist.hugerth.fecal$Canb, "Canberra", subset_samples(physeq.hugerth, sample_type=="stool"))
plotDistances3D(dist.hugerth.fecal$Bray, "Bray-Curtis", subset_samples(physeq.hugerth, sample_type=="stool"))
# Sigmoid samples
plotDistances3D(dist.hugerth.sigmoid$UniF, "UniFrac", subset_samples(physeq.hugerth, sample_type=="sigmoid"))
plotDistances3D(dist.hugerth.sigmoid$Ait, "Aitchison", subset_samples(physeq.hugerth, sample_type=="sigmoid"))
plotDistances3D(dist.hugerth.sigmoid$Canb, "Canberra", subset_samples(physeq.hugerth, sample_type=="sigmoid"))
plotDistances3D(dist.hugerth.sigmoid$Bray, "Bray-Curtis", subset_samples(physeq.hugerth, sample_type=="sigmoid"))

HIERARCHICAL CLUSTERING

# For heatmaps: have group color
matcol <- data.frame(phenotype = sample_data(physeq.hugerth)[,"host_disease"],
                     sample = sample_data(physeq.hugerth)[,"sample_type"])


# Function to get heatmap from the distances computed
plotHeatmaps <- function(dlist, fontsize){
  
  # Initialize variables
  i=1
  plist <- vector("list", 4)
  names(plist) <- names(dlist)
  
  # Loop through distances
  for(d in dlist){
    plist[[i]] <- pheatmap(as.matrix(d), 
                          clustering_distance_rows = d,
                          clustering_distance_cols = d,
                          fontsize = fontsize,
                          show_rownames = F,
                          show_colnames = F,
                          annotation_col = matcol,
                          # annotation_row = matcol,
                          annotation_colors = list(host_disease = c('Healthy' = 'blue', 'IBS' = 'red'),
                                                   sample_type = c("stool" = "#33CC00", 'sigmoid' = 'purple')),
                          cluster_rows = T,
                          cluster_cols = T,
                          clustering_method = 'complete', # hc method
                          main = names(dlist)[i]) # have name of distance as title
    i <- i+1
  }
  
  return(plist)
}


# Get the heatmaps
heatmp.hugerth <- plotHeatmaps(dlist = dist.hugerth, fontsize = 8)

REPRODUCE PLOTS FROM PAPER

Figure 3 - Comparison of the Bray-Curtis beta-diversity dispersion

#___________________________________________________________________
# FIGURE 3
library(reshape2)

# get useful metadata
metadata <- sample_data(physeq.hugerth) %>%
  as_tibble() %>%
  select(Run, host_disease, sample_type)

# Build dataframe with Bray-Curtis distance between IBS-HC, IBS-IBS, HC-HC
bc.comp <- melt(as.matrix(dist.hugerth$Bray), varnames = c("row", "col")) %>%
  inner_join(metadata, by=c("row"="Run")) %>%
  dplyr::rename(row_disease=host_disease, row_type=sample_type) %>%
  inner_join(metadata, by=c("col"="Run")) %>%
  dplyr::rename(col_disease=host_disease, col_type=sample_type) %>%
  # Keep only distances between samples from same sample_type
  filter(row_type == col_type) %>%
  select(-col_type) %>%
  dplyr::rename(sample_type=row_type) %>%
  relocate(row_disease, .after=sample_type) %>%
  # Classify comparison as between diseases, between Healthy, or between IBS samples
  mutate(compare=ifelse(row_disease != col_disease, "Between",
                        ifelse(row_disease == "Healthy", "Healthy",
                               ifelse(row_disease == "IBS", "IBS", "Unknown")))) %>%
  mutate(compare=factor(compare, levels=c("Between", "IBS", "Healthy")))

# Sanity check
bc.comp %>%
  dplyr::count(row_disease, col_disease, compare)
# Plot
ggplot(bc.comp, aes(x=compare, y=value))+
  facet_wrap(~sample_type, scales="free")+
  geom_violin()+
  geom_boxplot(width=0.4, outlier.shape = NA, notch=T)+
  geom_jitter(size=0.1, width=0.05)+
  theme_cowplot()+
  labs(x='', y='Bray-Curtis dissimilarity')

Figure 4 - NMDS highlighting IBS diagnosis or healthy

# Calculate NMDS coordinates
set.seed(123)
NMDS.Bray.fecal <- ordinate(physeq = subset_samples(physeq.CSN, sample_type=="stool"),
                            method = "NMDS",
                            distance = dist.hugerth.fecal$Bray)
## Run 0 stress 0.225931 
## Run 1 stress 0.2293622 
## Run 2 stress 0.2285311 
## Run 3 stress 0.2255545 
## ... New best solution
## ... Procrustes: rmse 0.01992115  max resid 0.1154476 
## Run 4 stress 0.2313091 
## Run 5 stress 0.2450373 
## Run 6 stress 0.235915 
## Run 7 stress 0.2255749 
## ... Procrustes: rmse 0.008081218  max resid 0.07390167 
## Run 8 stress 0.2257507 
## ... Procrustes: rmse 0.005270451  max resid 0.04131554 
## Run 9 stress 0.2256218 
## ... Procrustes: rmse 0.008101816  max resid 0.07879486 
## Run 10 stress 0.234097 
## Run 11 stress 0.2255944 
## ... Procrustes: rmse 0.00838837  max resid 0.06781654 
## Run 12 stress 0.2256381 
## ... Procrustes: rmse 0.00861187  max resid 0.07966027 
## Run 13 stress 0.2259211 
## ... Procrustes: rmse 0.01998228  max resid 0.1156725 
## Run 14 stress 0.226067 
## Run 15 stress 0.2371384 
## Run 16 stress 0.2262775 
## Run 17 stress 0.228454 
## Run 18 stress 0.2256025 
## ... Procrustes: rmse 0.002503418  max resid 0.02473633 
## Run 19 stress 0.2258836 
## ... Procrustes: rmse 0.02064241  max resid 0.1145083 
## Run 20 stress 0.2385753 
## *** No convergence -- monoMDS stopping criteria:
##      6: no. of iterations >= maxit
##     14: stress ratio > sratmax
NMDS.Bray.sigmoid <- ordinate(physeq = subset_samples(physeq.CSN, sample_type=="sigmoid"),
                              method = "NMDS",
                              distance = dist.hugerth.sigmoid$Bray)
## Run 0 stress 0.2593206 
## Run 1 stress 0.259414 
## ... Procrustes: rmse 0.01411048  max resid 0.1901508 
## Run 2 stress 0.2601868 
## Run 3 stress 0.2615324 
## Run 4 stress 0.2615583 
## Run 5 stress 0.2619001 
## Run 6 stress 0.2616245 
## Run 7 stress 0.2669166 
## Run 8 stress 0.2606921 
## Run 9 stress 0.2592282 
## ... New best solution
## ... Procrustes: rmse 0.01548516  max resid 0.2404092 
## Run 10 stress 0.2605332 
## Run 11 stress 0.2603586 
## Run 12 stress 0.2601874 
## Run 13 stress 0.259936 
## Run 14 stress 0.2605008 
## Run 15 stress 0.2601548 
## Run 16 stress 0.2613746 
## Run 17 stress 0.2622324 
## Run 18 stress 0.2670312 
## Run 19 stress 0.2606263 
## Run 20 stress 0.2606027 
## *** No convergence -- monoMDS stopping criteria:
##     12: no. of iterations >= maxit
##      8: stress ratio > sratmax
# Plot
p1 <- plot_ordination(subset_samples(physeq.CSN, sample_type=="stool"), NMDS.Bray.fecal, color="host_disease")+
  geom_point(size=4, alpha=0.5)  + scale_color_manual(values = c('blue', 'red'))+
  theme_cowplot()+
  theme(legend.position = "none")+
  labs(title="Fecal samples")

p2 <- plot_ordination(subset_samples(physeq.CSN, sample_type=="sigmoid"), NMDS.Bray.sigmoid, color="host_disease")+
  geom_point(size=4, alpha=0.5)  + scale_color_manual(values = c('blue', 'red'))+
  theme_cowplot()+
  labs(color="Disease", title="Sigmoid samples")

ggpubr::ggarrange(p1,p2,ncol=2)

Figure 5 - relationship between samples from same individual

# Identify individuals that have both a sigmoid and a fecal sample
duo.samples <- sample_data(physeq.hugerth) %>%
  as.data.frame() %>%
  group_by(host_ID) %>%
  filter(n_distinct(sample_type)>1) %>%
  arrange(host_ID) %>%
  ungroup()
# Sanity checks
# head(duo.samples) # sanity check
# length(duo.samples$Run)


# Get Chao1 richness
chao1 <- plot_richness(subset_samples(physeq.hugerth, Run %in% duo.samples$Run),
                       measures="Chao1")
chao1.df <- chao1$data %>%
  select(Run, host_ID, sample_type, value) %>%
  arrange(host_ID) %>%
  pivot_wider(id_cols=host_ID, names_from=sample_type, values_from=value, values_fn=mean)

# Calculate NMDS coordinates
set.seed(123)
NMDS.Bray.comp <- ordinate(physeq = subset_samples(physeq.CSN, Run %in% duo.samples$Run),
                            method = "NMDS",
                            distance = "bray")
## Square root transformation
## Wisconsin double standardization
## Run 0 stress 0.2014495 
## Run 1 stress 0.2004157 
## ... New best solution
## ... Procrustes: rmse 0.03447129  max resid 0.1815561 
## Run 2 stress 0.2004676 
## ... Procrustes: rmse 0.01962892  max resid 0.1205406 
## Run 3 stress 0.1962415 
## ... New best solution
## ... Procrustes: rmse 0.03437329  max resid 0.1844234 
## Run 4 stress 0.1992742 
## Run 5 stress 0.1990611 
## Run 6 stress 0.2041989 
## Run 7 stress 0.2051122 
## Run 8 stress 0.2033499 
## Run 9 stress 0.2018464 
## Run 10 stress 0.2008418 
## Run 11 stress 0.2066106 
## Run 12 stress 0.2029382 
## Run 13 stress 0.2038747 
## Run 14 stress 0.2033621 
## Run 15 stress 0.2038407 
## Run 16 stress 0.2031519 
## Run 17 stress 0.1971168 
## Run 18 stress 0.2044534 
## Run 19 stress 0.2012923 
## Run 20 stress 0.2016106 
## *** No convergence -- monoMDS stopping criteria:
##     11: no. of iterations >= maxit
##      9: stress ratio > sratmax
# Plot
p1 <- ggplot(chao1.df, aes(x=stool, y=sigmoid))+
  geom_point(size=3)+
  theme_cowplot()+
  labs(x="Stool Chao1", y="Sigmoid biopsy Chao1", title="A")

p2 <- plot_ordination(subset_samples(physeq.CSN, Run %in% duo.samples$Run), NMDS.Bray.comp, color="sample_type")+
  geom_line(aes(group=host_ID), color="grey", lwd=0.2)+
  geom_point(size=4, alpha=0.5, shape=1)  +
  scale_color_manual(values = c('orange', 'red'))+
  theme_cowplot()+
  labs(color="Sample type", title="B")

ggpubr::ggarrange(p1,p2,ncol=2, widths = c(4,5))